# Packages (and exact versions) this analysis was developed against.
packages <- c("dplyr", "emmeans", "lme4", "lmerTest", "here", "ltm", "glmnet", "ggplot2", "stringi", "lubridate", "ggpattern", "tidyr")
versions <- c("1.0.8", "1.8.1.1", "1.1.28", "3.1.3", "1.0.1", "1.2.0", "4.1.4", "3.4.4", "1.7.6", "1.8.0", "1.0.1", "1.2.0")

# Install remotes if not already installed
if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")

# Install any package that is missing or at the wrong version, then attach it.
for (pkg_idx in seq_along(packages)) {
  pkg <- packages[pkg_idx]
  wanted <- versions[pkg_idx]
  needs_install <- !requireNamespace(pkg, quietly = TRUE) ||
    packageVersion(pkg) != wanted
  if (needs_install) {
    remotes::install_version(pkg, version = wanted)
  }
  library(pkg, character.only = TRUE)
}
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: Matrix
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
## here() starts at /Users/roundtable/Documents/github-bootstrapping
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: msm
## Loading required package: polycor
## Loaded glmnet 4.1-4
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:Matrix':
## 
##     expand, pack, unpack
# Verify package versions.
# NOTE: the original compared sapply(packages, packageVersion) — a list of
# "package_version" objects — directly against a character vector, which
# flagged every package as mismatched even when versions were correct.
# Comparing each packageVersion() against its expected string uses proper
# version semantics (so e.g. "4.1-4" and "4.1.4" compare equal).
expected_versions <- setNames(versions, packages)
version_matches <- vapply(
  packages,
  function(p) packageVersion(p) == expected_versions[[p]],
  logical(1)
)
all_correct <- all(version_matches)

if (!all_correct) {
  incorrect <- which(!version_matches)
  warning(paste("Some package versions do not match the expected versions:",
                paste(names(incorrect), collapse = ", ")))
}
## Warning: Some package versions do not match the expected versions: dplyr,
## emmeans, lme4, lmerTest, here, ltm, glmnet, ggplot2, stringi, lubridate,
## ggpattern, tidyr
# Show the project root that here() resolved — sanity check for the relative
# paths used throughout this script.
print(here())
## [1] "/Users/roundtable/Documents/github-bootstrapping"
# Load shared helper functions used below (se(), identify_junk(),
# threshold_estimates(), get_city_averages(), ...).
source(here("data_analysis", "multi-choice-utils.R"))

# Print session info for reproducibility
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_1.2.0     ggpattern_1.0.1 lubridate_1.8.0 stringi_1.7.6  
##  [5] ggplot2_3.4.4   glmnet_4.1-4    ltm_1.2-0       polycor_0.8-1  
##  [9] msm_1.7         MASS_7.3-54     here_1.0.1      lmerTest_3.1-3 
## [13] lme4_1.1-28     Matrix_1.3-4    emmeans_1.8.1-1 dplyr_1.0.8    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.8          mvtnorm_1.1-3       lattice_0.20-45    
##  [4] assertthat_0.2.1    rprojroot_2.0.2     digest_0.6.29      
##  [7] foreach_1.5.2       utf8_1.2.2          R6_2.5.1           
## [10] evaluate_0.15       pillar_1.7.0        rlang_1.1.1        
## [13] rstudioapi_0.13     minqa_1.2.4         nloptr_2.0.0       
## [16] jquerylib_0.1.4     rmarkdown_2.13      splines_4.1.2      
## [19] stringr_1.4.0       munsell_0.5.0       compiler_4.1.2     
## [22] numDeriv_2016.8-1.1 xfun_0.30           pkgconfig_2.0.3    
## [25] shape_1.4.6         htmltools_0.5.2     tidyselect_1.1.1   
## [28] tibble_3.1.6        expm_0.999-6        codetools_0.2-18   
## [31] fansi_1.0.2         withr_2.5.1         crayon_1.5.0       
## [34] grid_4.1.2          nlme_3.1-153        jsonlite_1.7.3     
## [37] xtable_1.8-4        gtable_0.3.0        lifecycle_1.0.3    
## [40] DBI_1.1.2           magrittr_2.0.2      scales_1.2.1       
## [43] estimability_1.4.1  cli_3.6.1           remotes_2.4.2      
## [46] bslib_0.3.1         ellipsis_0.3.2      admisc_0.30        
## [49] generics_0.1.2      vctrs_0.6.4         boot_1.3-28        
## [52] iterators_1.0.14    tools_4.1.2         glue_1.6.1         
## [55] purrr_0.3.4         parallel_4.1.2      fastmap_1.1.0      
## [58] survival_3.2-13     yaml_2.3.5          colorspace_2.0-2   
## [61] knitr_1.37          sass_0.4.1
# Tableau 10 categorical palette used for all figures in this script.
tableau_palette <- c(
  "#4E79A7", "#F28E2B", "#E15759", "#76B7B2", "#59A14F",
  "#EDC948", "#B07AA1", "#FF9DA7", "#9C755F", "#BAB0AC"
)
raw_experiment_data = read.csv('../experiment_data/raw_experiment_data.csv')

# Workers who did not contribute exactly the expected 48 responses; these
# incomplete/duplicated workers are excluded from the analysis below.
bad_workers <- raw_experiment_data %>%
  count(workerId_hash, name = "n") %>%
  filter(n != 48)

# Clean trial-level data: drop incomplete workers, junk responses, practice
# trials, and failed attention checks; then tag each trial and clip
# implausible temperature estimates.
experiment_data <- raw_experiment_data %>%
  subset(!(workerId_hash %in% bad_workers$workerId_hash)) %>%
  identify_junk() %>%
  subset(is_practice == "False") %>%
  subset(passed_check == TRUE) %>%  # TRUE, not T: T can be reassigned
  mutate(
    trial_id = paste(workerId_hash, hitId, predictor_city, sep = '-'),
    year = 2023
  ) %>%
  threshold_estimates()

# Trials whose estimates are (nearly) constant across months — at most two
# distinct temperature values; these get a small jitter below so downstream
# computations have variation to work with.
bad_trials <- experiment_data %>%
  group_by(trial_id) %>%
  summarize(num_temps = n_distinct(temperature_estimate)) %>%
  filter(num_temps <= 2) %>%
  pull(trial_id) %>%
  unique()


# Jitter the near-constant trials slightly (sd = 0.01) so bootstrapping has
# variation to work with, attach bootstrap estimates, and pretty-print the
# difficulty labels as an ordered factor.
experiment_data <- experiment_data %>%
  mutate(
    bad_trial = trial_id %in% bad_trials,
    adjusted_temperature_estimate = case_when(
      bad_trial ~ temperature_estimate + rnorm(n(), 0, 0.01),
      TRUE ~ as.double(temperature_estimate)
    )
  ) %>%
  add_bootstrapping_estimates(4, FALSE) %>%  # FALSE, not F: F can be reassigned
  mutate(
    type = case_when(
      type == 'challenging' ~ 'Challenging',
      type == 'kind' ~ 'Kind',
      type == 'wicked' ~ 'Wicked'
    ),
    type = factor(type, levels = c('Kind', 'Challenging', 'Wicked'))
  )


# Predictor cities grouped by trial difficulty (8 cities per group, 24 total).
kind_cities <- c(
  'Baltimore', 'Charlotte', 'Denver', 'Orlando',
  'Portland', 'Sacramento', 'San Antonio', 'St. Louis'
)
challenging_cities <- c(
  'Cairo', 'Delhi', 'Lagos', 'London',
  'Mexico City', 'Paris', 'Tokyo', 'Toronto'
)
wicked_cities <- c(
  'Auckland', 'Buenos Aires', 'Johannesburg', 'Lima',
  'Luanda', 'Santiago', 'Sao Paulo', 'Sydney'
)
all_cities <- c(kind_cities, challenging_cities, wicked_cities)
# City as a factor ordered kind -> challenging -> wicked, plus a readable
# condition label for plot legends.
experiment_data$ordered_predictor_city <- factor(
  experiment_data$predictor_city,
  levels = all_cities
)
experiment_data$pretty_condition <- ifelse(
  experiment_data$condition == 'forced-no-model',
  'Control',
  'Model assistance'
)
# Estimate abilities from IRT model and add to experiment_data

# Keep only workers that survive the cleaning above, then build the 0/1
# response matrix for the IRT model from the "correct" columns (sorted so
# column order is deterministic).
worker_data <- read.csv("../experiment_data/worker_data.csv")

worker_data <- worker_data %>%
  subset(workerId_hash %in% unique(experiment_data$workerId_hash))

irt_data <- dplyr::select(worker_data, matches("correct"))
irt_data <- irt_data[, order(colnames(irt_data))]
# lapply (not sapply) is type-stable: sapply would simplify a single-column
# selection to a bare vector instead of keeping a column per item.
irt_data <- data.frame(lapply(irt_data, \(x) as.integer(as.logical(x))))

# Fit a three-parameter logistic IRT model (ltm::tpm) to the 0/1 item
# responses. The commented-out ltm() calls below were an exploratory
# comparison against the two-parameter model, kept for reference.
#irt_model = ltm(irt_data ~ z1)
irt_model = tpm(irt_data) # Three-parameter model
# irt_model_og = ltm(irt_data ~ z1)
# Correlate scores from both models...

# og_ability = factor.scores(irt_model_og, resp.patterns = irt_data)$score.dat$z1
# new_ability =  factor.scores(irt_model, resp.patterns = irt_data)$score.dat$z1

# plot(og_ability,new_ability)

# Latent ability (z1 factor score) and raw number-correct for each worker.
worker_data$ability = factor.scores(irt_model, resp.patterns = irt_data)$score.dat$z1
worker_data$num_correct = rowSums(irt_data)

# Merge worker-level ability scores into the trial-level experiment data.
# all.x = TRUE keeps every experiment row even if a worker is absent from
# worker_data (TRUE, not T: T can be reassigned).
experiment_data <- experiment_data %>%
  merge(worker_data, by = 'workerId_hash', all.x = TRUE)
# Overall error rates: bar plot of mean absolute error by condition.
overall_errors <- experiment_data %>%
  group_by(pretty_condition) %>%
  summarize(
    average_error = mean(abs_error),
    error_se = se(abs_error, n())
  ) %>%
  ggplot(aes(x = pretty_condition, y = average_error, fill = pretty_condition)) +
  geom_bar(stat = 'identity') +
  # linewidth replaces the `size` aesthetic for lines, deprecated in
  # ggplot2 3.4.0 (this file pins ggplot2 3.4.4).
  geom_errorbar(aes(ymin = average_error - error_se, ymax = average_error + error_se),
                linewidth = 0.7, width = 0.4) +
  labs(
    x = 'Condition',
    y = 'Mean absolute error'
  ) +
  scale_y_continuous(labels = function(y) paste0(y, "°"), expand = c(0, 0), limits = c(0, 19)) +
  scale_fill_manual(values = tableau_palette) +
  theme_classic() +
  theme(
    legend.position = 'none',
    plot.title = element_text(hjust = 0.5),
    axis.title.x = element_blank()
  )

overall_errors

# Overall error rates by trial difficulty: grouped bar plot.
errors_by_type <- experiment_data %>%
  group_by(pretty_condition, type) %>%
  summarize(
    average_error = mean(abs_error),
    error_se = se(abs_error, n()),
    .groups = "drop"  # result feeds straight into ggplot; also silences the message
  ) %>%
  ggplot(aes(x = type, y = average_error, fill = pretty_condition)) +
  geom_bar(stat = 'identity', position = position_dodge(0.9)) +
  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_errorbar(aes(ymin = average_error - error_se, ymax = average_error + error_se),
                linewidth = 0.7, width = 0.45, position = position_dodge(0.9)) +
  labs(
    x = 'Trial difficulty',
    y = 'Mean absolute error',
    fill = "Condition"
  ) +
  scale_y_continuous(labels = function(y) paste0(y, "°"), expand = c(0, 0), limits = c(0, 19)) +
  scale_fill_manual(values = tableau_palette) +
  theme_classic() +
  theme(axis.title.x = element_blank())

errors_by_type

# Per-city mean human error, split by condition.
city_data <- experiment_data %>%
  group_by(pretty_condition, type, ordered_predictor_city) %>%
  summarize(
    average_error = mean(abs_error),
    average_error_se = se(abs_error, n()),
    .groups = "drop"
  )

average_error_by_city <- city_data %>%
  ggplot(aes(x = pretty_condition, y = average_error, color = pretty_condition, shape = pretty_condition)) +
  geom_point(size = 2) +
  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_errorbar(aes(ymin = average_error - average_error_se, ymax = average_error + average_error_se),
                width = 0.45, linewidth = 0.7) +
  facet_wrap(. ~ ordered_predictor_city, ncol = 8) +
  scale_color_manual(values = tableau_palette) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) +
  labs(
    y = 'Mean absolute error',
    color = 'Condition'
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = 'bottom'
  )

average_error_by_city

# Spaghetti plots: All passed check
# Monthly average highs for every target city that appears in model-assisted
# trials.
assisted_trials <- subset(experiment_data, model_assistance == "True")
predictor_city_highs <- get_city_averages(unique(assisted_trials$predictor_city))

# Most frequently chosen model city per target, with long city names
# shortened so the labels fit inside the facet panels. recode() leaves any
# unmatched name unchanged, mirroring the original case_when fall-through.
most_popular_model_city_highs <- get_most_popular_model_cities(experiment_data) %>%
  mutate(
    city = factor(predictor_city, levels = all_cities),
    most_popular_model_city = recode(
      most_popular_model_city,
      'San Francisco' = 'SF',
      'New York' = 'NYC',
      'Washington' = 'DC',
      'Los Angeles' = 'LA',
      'Philadelphia' = 'Philly',
      'Minneapolis' = 'MPLS'
    )
  )
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
## Joining, by = "city"
# Lowest-mean-error model city per target, with the same name abbreviations
# as above. recode() passes unmatched names through unchanged.
best_model_city_highs <- get_best_model_cities(experiment_data) %>%
  mutate(
    city = factor(predictor_city, levels = all_cities),
    best_model_city = recode(
      best_model_city,
      'San Francisco' = 'SF',
      'New York' = 'NYC',
      'Washington' = 'DC',
      'Los Angeles' = 'LA',
      'Philadelphia' = 'Philly',
      'Minneapolis' = 'MPLS'
    )
  )
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'predictor_city'. You can override using
## the `.groups` argument.
# Spaghetti plot: each facet overlays the target city's true climatology with
# the most-chosen and lowest-error model cities.
chosen_cities_spaghetti <- predictor_city_highs %>%
  mutate(city = factor(city, levels = all_cities)) %>%
  ggplot(aes(x = month, y = average_hi)) +
  # Target city's true monthly average highs.
  # linewidth replaces the deprecated `size` aesthetic for lines.
  geom_line(aes(colour = "Target"), linewidth = 1, alpha = 0.55) +
  # Most frequently chosen model city
  geom_line(data = most_popular_model_city_highs,
            aes(x = month, y = model_city_modeled_high, colour = "Most chosen"),
            linewidth = 1, alpha = 0.9) +
  # subset(month == 2) keeps one label row per city; the label itself is
  # drawn at a fixed (x, y) inside each facet.
  geom_text(data = subset(most_popular_model_city_highs, month == 2),
            aes(x = 7.2, y = 48, label = most_popular_model_city),
            color = '#E15759') +
  # Model city with the lowest mean error
  geom_line(data = best_model_city_highs,
            aes(x = month, y = best_model_predicted_hi, colour = "Lowest mean error"),
            linewidth = 1, alpha = 0.55) +
  geom_text(data = subset(best_model_city_highs, month == 10),
            aes(x = 7.2, y = 38, label = best_model_city),
            color = '#76B7B2') +
  scale_x_continuous(breaks = 1:12) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) +
  facet_wrap(. ~ city, ncol = 8) +
  labs(y = 'Average high', x = 'Month') +
  scale_colour_manual(
    values = c("Target" = "black",
               "Most chosen" = "#E15759",
               "Lowest mean error" = "#76B7B2"),
    breaks = c("Target", "Most chosen", "Lowest mean error"),
    guide = guide_legend(override.aes = list(alpha = 0.55)),
    name = "City type"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = 'bottom'
  )

chosen_cities_spaghetti

# Plot overall bootstrapping and human errors

# Stack human and bootstrap errors into one long data frame, tagged by
# predictor_type, so the two can be plotted side by side.
human_forecasts <- experiment_data %>%
  transmute(
    pretty_condition,
    error = abs_error,
    type,
    ordered_predictor_city,
    predictor_type = 'Human'
  )

bootstrap_forecasts <- experiment_data %>%
  transmute(
    pretty_condition,
    error = bootstrap_error,
    type,
    ordered_predictor_city,
    predictor_type = 'Bootstrap'
  )

errors_df <- rbind(human_forecasts, bootstrap_forecasts) %>%
  mutate(predictor_type = factor(predictor_type, levels = c('Human', 'Bootstrap')))

# Human vs bootstrap error by condition and difficulty, with patterned bars
# distinguishing the predictor type.
full_results_plot <- errors_df %>%
  group_by(pretty_condition, predictor_type, type) %>%
  summarize(
    average_error = mean(error),
    error_se = se(error, n()),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = pretty_condition, y = average_error, fill = pretty_condition, pattern = predictor_type)) +

  # Using geom_bar_pattern for patterned bars
  geom_bar_pattern(stat = 'identity', position = position_dodge(0.9),
                   pattern_density = 0.1,
                   pattern_spacing = 0.015,
                   pattern_fill = "black",
                   pattern_angle = 45,
                   pattern_alpha = 0.5) +

  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_errorbar(aes(ymin = average_error - error_se, ymax = average_error + error_se),
                linewidth = 0.7, width = 0.4, position = position_dodge(0.9)) +
  labs(
    x = 'Model assistance',
    y = 'Mean absolute error',
    fill = 'Predictor type',
    pattern = "Predictor type"
  ) +
  scale_pattern_manual(values = c(Bootstrap = "stripe", Human = "none")) +
  scale_y_continuous(labels = function(y) paste0(y, "°"), expand = c(0, 0), limits = c(0, 19)) +
  scale_fill_manual(values = tableau_palette) +
  facet_grid(cols = vars(type)) +
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    strip.background = element_blank()
  )

full_results_plot

# Climatology predictions for every model city participants actually chose,
# joined onto the experiment data to compute the chosen-model error.
model_city_averages <- unique(subset(experiment_data, model_assistance == 'True')$model_city) %>%
  get_city_averages() %>%
  get_model_predicted_highs()
## (grouped-summarise messages above come from the helper functions)
experiment_data_2 <- experiment_data %>%
  left_join(model_city_averages, by = c("model_city" = "city", "month" = "month")) %>%
  mutate(
    # NA when the trial had no model city; otherwise the model's prediction
    model_city_model_high = ifelse(is.na(model_city), NA, model_predicted_hi),
    model_city_error = abs(model_city_model_high - average_hi)
  )

# Mean human vs chosen-model-city error by difficulty (model-assisted trials
# only).
grouped_data <- experiment_data_2 %>%
  subset(model_assistance == 'True') %>%
  group_by(type) %>%
  summarize(
    average_human_error = mean(abs_error),
    se_human_error = se(abs_error, n()),        # standard error for humans
    average_model_error = mean(model_city_error),
    se_model_error = se(model_city_error, n())  # standard error for chosen model cities
  )

# Reshape to long format — one row per (type, method) with its mean and SE —
# and relabel the methods for the plot legend.
df_long <- grouped_data %>%
  pivot_longer(
    cols = c(starts_with("average"), starts_with("se")),
    names_to = c(".value", "method"),
    names_pattern = "(.+)_(.*)_error"
  ) %>%
  mutate(method = ifelse(method == "human", "Human", "Chosen cities"))

# Plot: human vs chosen-city error by difficulty.
humans_vs_cities <- df_long %>%
  mutate(method = factor(method, levels = c('Human', 'Chosen cities'))) %>%
  ggplot(aes(x = type, y = average, fill = method)) +
  geom_bar(stat = "identity", position = "dodge") +
  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_errorbar(aes(ymin = average - se, ymax = average + se),
                linewidth = 0.7, width = 0.45, position = position_dodge(0.9)) +
  scale_y_continuous(labels = function(y) paste0(y, "°"), expand = c(0, 0), limits = c(0, 18)) +
  scale_fill_manual(values = c("#B07AA1", "#BAB0AC")) +
  theme_classic() +
  theme(axis.title.x = element_blank()) +
  labs(y = "Mean absolute error", x = "Type", fill = "Predictor type")

humans_vs_cities

top_10 = quantile(subset(experiment_data,model_assistance=="True")$ability,0.9)

# Same comparison as above, restricted to the top decile of ability
# ("experts"), model-assisted trials only.
grouped_data_expert <- experiment_data_2 %>%
  subset(ability > top_10) %>%
  subset(model_assistance == 'True') %>%
  group_by(type) %>%
  summarize(
    average_human_error = mean(abs_error),
    se_human_error = se(abs_error, n()),        # standard error for experts
    average_model_error = mean(model_city_error),
    se_model_error = se(model_city_error, n())  # standard error for chosen model cities
  )

# Reshape the expert summary to long format and relabel the methods for the
# plot legend.
df_long_expert <- grouped_data_expert %>%
  pivot_longer(
    cols = c(starts_with("average"), starts_with("se")),
    names_to = c(".value", "method"),
    names_pattern = "(.+)_(.*)_error"
  ) %>%
  mutate(method = ifelse(method == "human", "Experts", "Chosen cities"))

# Plot: expert vs chosen-city error by difficulty.
experts_vs_cities <- df_long_expert %>%
  mutate(method = factor(method, levels = c('Experts', "Chosen cities"))) %>%
  ggplot(aes(x = type, y = average, fill = method)) +
  geom_bar(stat = "identity", position = "dodge") +
  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_errorbar(aes(ymin = average - se, ymax = average + se),
                linewidth = 0.7, width = 0.45, position = position_dodge(0.9)) +
  scale_y_continuous(labels = function(y) paste0(y, "°"), expand = c(0, 0), limits = c(0, 18)) +
  scale_fill_manual(values = c("#B07AA1", "#BAB0AC")) +
  theme_classic() +
  theme(axis.title.x = element_blank()) +
  labs(y = "Mean absolute error", x = "Type", fill = "Predictor type")

experts_vs_cities

# Ability (x) vs per-worker mean absolute error (y), with a linear fit per
# condition, faceted by difficulty.
ability_plot <- experiment_data %>%
  group_by(workerId_hash, pretty_condition, type) %>%
  summarize(
    ability = mean(ability),
    average_error = mean(abs_error),
    .groups = "drop"  # feeds straight into ggplot; also silences the message
  ) %>%
  ggplot(aes(x = ability, y = average_error, color = pretty_condition)) +
  geom_point(alpha = 0.1) +
  scale_color_manual(values = tableau_palette) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) +
  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_smooth(method = "lm", se = FALSE, linetype = 1, linewidth = 1.75, alpha = 0.08) +
  labs(
    x = 'Ability',
    y = 'Mean absolute error',
    color = 'Condition'
  ) +
  theme_classic() +
  theme(
    strip.background = element_blank()
  ) +
  facet_grid(cols = vars(type))

ability_plot
## `geom_smooth()` using formula = 'y ~ x'

# Add each month's calendar neighbors, wrapping around the year: January's
# predecessor is December, and December's successor is January. Used below to
# interpolate each month's estimate from its neighbors.
experiment_data <- experiment_data %>%
  mutate(
    preceding_month = ifelse(month == 1, 12, month - 1),
    trailing_month  = ifelse(month == 12, 1, month + 1)
  )

# Add estimate for preceeding month to experiment_data.
# Self-join trick: relabel each row's estimate as belonging to its preceding
# month, then right_join back onto experiment_data by (trial_id, month) so
# every row picks up the estimate its participant gave for the prior month.
# (Column name "preceeding_estimate" keeps the original spelling — it is
# referenced again when computing interpolated_estimate below.)
experiment_data = experiment_data %>%
  transmute(
    trial_id,
    preceeding_estimate = temperature_estimate,
    month = preceding_month
  ) %>%
  right_join(experiment_data,by=c('trial_id','month'))

# Add estimate for trailing month to experiment_data.
# Same self-join trick as above, in the other direction: each row picks up
# the estimate its participant gave for the following month.
experiment_data = experiment_data %>%
  transmute(
    trial_id,
    trailing_estimate = temperature_estimate,
    month = trailing_month
  ) %>%
  right_join(experiment_data,by=c('trial_id','month'))

# Interpolated baseline: the rounded mean of the two neighboring months'
# estimates, and its absolute error against the true average high.
experiment_data <- experiment_data %>%
  mutate(
    interpolated_estimate = round((preceeding_estimate + trailing_estimate) / 2),
    interpolated_error = abs(interpolated_estimate - average_hi)
  )

# Mean human vs interpolated error (with SEs), by condition and difficulty.
grouped_data <- experiment_data %>%
  group_by(pretty_condition, type) %>%
  summarize(
    average_human_error = mean(abs_error),
    average_interpolated_error = mean(interpolated_error),
    se_human_error = se(abs_error, n()),
    se_interpolated_error = se(interpolated_error, n())
  )
## `summarise()` has grouped output by 'pretty_condition'. You can override using
## the `.groups` argument.
library(stringr)

# Reshape to long format and pair each mean with its matching SE. The two
# sequential pivot_longer calls form a cross product of (average_*, se_*)
# rows; relabeling both columns and filtering on equality keeps only the
# correctly matched human/interpolated pairs.
long_data <- grouped_data %>%
  pivot_longer(cols = starts_with("average_"),
               names_to = "predictor_type",
               values_to = "error") %>%
  pivot_longer(cols = starts_with("se_"),
               names_to = "se_type",
               values_to = "se") %>%
  mutate(predictor_type = ifelse(str_detect(predictor_type, "human"), "Human", "Interpolated"),
         se_type = ifelse(str_detect(se_type, "human"), "Human", "Interpolated")) %>%
  filter(predictor_type == se_type)

# Mean bootstrap error per cell, relabeled 'Interpolated' so these values can
# be overlaid as reference points on the interpolation plot below.
interpolated_errors <- errors_df %>%
  filter(predictor_type != 'Human') %>%
  group_by(pretty_condition, type) %>%
  summarize(error = mean(error)) %>%
  mutate(predictor_type = 'Interpolated')
## `summarise()` has grouped output by 'pretty_condition'. You can override using
## the `.groups` argument.
# Human vs interpolated error by condition/difficulty; diamonds mark the mean
# bootstrap error for comparison.
interpolation_plot <- long_data %>%
  ggplot(aes(x = pretty_condition,
             y = error, fill = pretty_condition, pattern = predictor_type)) +
  geom_bar_pattern(stat = 'identity', position = position_dodge(0.9),
                   pattern_density = 0.3,
                   pattern_spacing = 0.015,
                   pattern_fill = "black",
                   pattern_angle = 45,
                   pattern_alpha = 0.6) +
  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_errorbar(aes(ymin = error - se, ymax = error + se),
                linewidth = 0.7, width = 0.4, position = position_dodge(0.9)) +
  labs(
    x = 'Model assistance',
    y = 'Mean absolute error',
    fill = 'Predictor type',
    pattern = "Predictor type"
  ) +
  scale_pattern_manual(values = c(Interpolated = "circle", Human = "none")) +
  scale_y_continuous(labels = function(y) paste0(y, "°"), expand = c(0, 0), limits = c(0, 19)) +
  scale_fill_manual(values = tableau_palette) +
  facet_grid(cols = vars(type)) +
  # Diamonds: mean bootstrap error, relabeled 'Interpolated' for the overlay
  geom_point(data = interpolated_errors, shape = 18, size = 3) +
  theme_classic() +
  theme(
    axis.title.x = element_blank(),
    strip.background = element_blank()
  )

interpolation_plot

# Per-city mean human error. Note `type = 'Human'` inside summarize replaces
# the difficulty grouping value with a predictor label, so these rows can be
# stacked with bootstrap_errors below and distinguished by `type`.
human_errors = experiment_data %>%
  group_by(type,ordered_predictor_city) %>%
  summarize(
    average_error = mean(abs_error),
    average_error_se = se(abs_error,n()),
    type = 'Human'
  )
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
# Per-city mean bootstrap error, labeled 'Bootstrap' via the same
# overwrite-the-grouping-column trick as human_errors above.
bootstrap_errors = experiment_data %>%
  group_by(type,ordered_predictor_city) %>%
  summarize(
    average_error = mean(bootstrap_error),
    average_error_se = se(bootstrap_error,n()),
    type = 'Bootstrap'
  )
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
# Human vs bootstrap error per target city.
bootstrap_by_city <- human_errors %>%
  rbind(bootstrap_errors) %>%
  mutate(type = factor(type, levels = c('Human', 'Bootstrap'))) %>%
  ggplot(aes(x = type, y = average_error, shape = type)) +
  geom_point(size = 2) +
  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_errorbar(aes(ymin = average_error - average_error_se, ymax = average_error + average_error_se),
                width = 0.45, linewidth = 0.7) +
  facet_wrap(. ~ ordered_predictor_city, ncol = 8) +
  scale_color_manual(values = tableau_palette) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) +
  labs(
    y = 'Mean absolute error',
    color = 'Condition'
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = 'bottom'
  )

# Display the plot built above. (The original printed average_error_by_city
# here — a repeat of an earlier figure — instead of the bootstrap_by_city
# plot it just constructed.)
bootstrap_by_city

# Per-city bootstrap error split by condition — mirrors the earlier city_data
# summary, but over bootstrap_error instead of abs_error.
city_data <- experiment_data %>%
  group_by(pretty_condition, type, ordered_predictor_city) %>%
  summarize(
    average_error = mean(bootstrap_error),
    average_error_se = se(bootstrap_error, n())
  )
## `summarise()` has grouped output by 'pretty_condition', 'type'. You can
## override using the `.groups` argument.
# Per-city bootstrap error by condition (same layout as the human-error
# version above).
average_bootstrap_error_by_city <- city_data %>%
  ggplot(aes(x = pretty_condition, y = average_error, color = pretty_condition, shape = pretty_condition)) +
  geom_point(size = 2) +
  # linewidth replaces the deprecated `size` aesthetic for lines
  geom_errorbar(aes(ymin = average_error - average_error_se, ymax = average_error + average_error_se),
                width = 0.45, linewidth = 0.7) +
  facet_wrap(. ~ ordered_predictor_city, ncol = 8) +
  scale_color_manual(values = tableau_palette) +
  scale_y_continuous(labels = function(y) paste0(y, "°")) +
  labs(
    y = 'Mean absolute error',
    color = 'Condition'
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.x = element_blank(),
    legend.position = 'bottom'
  )

average_bootstrap_error_by_city